How to do it?:

Submission: Submit the GitHub link to your assignment on Canvas.


  1. Write the following function. Give examples to test your function.

Hint: Similar function

#' Replace missing values in numeric columns with the column mean
#'
#' Non-numeric columns are returned untouched.
#'
#' @param d A data frame (or tibble).
#' @return `d` with every `NA` in a numeric column replaced by the mean of
#'   that column's non-missing values.
numeric_impute <- function(d) {
  # seq_along() gives an empty loop for a zero-column input; 1:ncol(d) would
  # iterate over c(1, 0) and fail on d[[1]].
  for (i in seq_along(d)) {
    column <- d[[i]]
    if (is.numeric(column)) {
      d[[i]][is.na(column)] <- mean(column, na.rm = TRUE)
    }
  }
  d
}
# Test numeric_impute: after imputation the numeric columns should report
# zero missing values while the non-numeric columns keep theirs.
library(tidyverse)
df  <- read_csv('adult_census.csv')

# Missing-value count per column before imputation.
colSums(is.na(df))
##            age      workclass         fnlwgt      education  education.num 
##             30             34              0             15              0 
## marital.status     occupation   relationship           race            sex 
##             26             35              0              0             24 
##   capital.gain   capital.loss hours.per.week native.country         income 
##              8              0              0             15              0
# Impute the numeric columns only, then recount the missing values.
df1 <- numeric_impute(df)
colSums(is.na(df1))
##            age      workclass         fnlwgt      education  education.num 
##              0             34              0             15              0 
## marital.status     occupation   relationship           race            sex 
##             26             35              0              0             24 
##   capital.gain   capital.loss hours.per.week native.country         income 
##              0              0              0             15              0

  2. Write the following function. Give examples to test your function.

Hint: Use If-statement to combine the function in Problem 1 and the function in this example

#' Impute missing values in every column of a data frame
#'
#' Numeric columns are filled with the column mean; all other columns are
#' filled with their most frequent non-missing value (the mode). A non-numeric
#' column that contains no observed value at all is left unchanged.
#'
#' @param d A data frame (or tibble).
#' @return `d` with the missing values replaced as described above.
impute_na <- function(d) {
  # seq_along() gives an empty loop for a zero-column input; 1:ncol(d) would
  # iterate over c(1, 0) and fail on d[[1]].
  for (i in seq_along(d)) {
    column <- d[[i]]
    missing <- is.na(column)
    if (is.numeric(column)) {
      d[[i]][missing] <- mean(column, na.rm = TRUE)
    } else if (!all(missing)) {
      # Tabulate the observed values only. If NA were allowed to compete it
      # would win whenever it is the most frequent entry, and the column
      # would be "imputed" with NA, i.e. not imputed at all.
      observed <- column[!missing]
      candidates <- unique(observed)
      counts <- tabulate(match(observed, candidates))
      d[[i]][missing] <- candidates[which.max(counts)]
    }
  }
  d
}
# Test impute_na: after imputation every column, numeric or not, should
# report zero missing values.
library(tidyverse)
df  <- read_csv('adult_census.csv')

# Missing-value count per column before imputation.
colSums(is.na(df))
##            age      workclass         fnlwgt      education  education.num 
##             30             34              0             15              0 
## marital.status     occupation   relationship           race            sex 
##             26             35              0              0             24 
##   capital.gain   capital.loss hours.per.week native.country         income 
##              8              0              0             15              0
# Impute all columns, then recount the missing values.
dfNew <- impute_na(df)
colSums(is.na(dfNew))
##            age      workclass         fnlwgt      education  education.num 
##              0              0              0              0              0 
## marital.status     occupation   relationship           race            sex 
##              0              0              0              0              0 
##   capital.gain   capital.loss hours.per.week native.country         income 
##              0              0              0              0              0

  3. Write the following function. Give examples to test your function.

Hint: Similar function

#' Draw a bar chart for every non-numeric column of a data frame
#'
#' @param d A data frame (or tibble).
#' @return `NULL`, invisibly. Called for its side effect of printing one
#'   ggplot2 bar chart per non-numeric column.
bar_plot <- function(d) {
  library(ggplot2)
  # seq_along() gives an empty loop for a zero-column input, unlike 1:ncol(d).
  for (i in seq_along(d)) {
    if (!is.numeric(d[[i]])) {
      col_name <- names(d)[i]
      # The .data pronoun replaces the deprecated aes_string() and, because the
      # name is never parsed as code, also works for column names that are not
      # syntactically valid R identifiers.
      print(ggplot(d, aes(x = .data[[col_name]])) +
            geom_bar() +
            labs(x = col_name))
    }
  }
  invisible(NULL)
}
# Test bar_plot: prints one bar chart per non-numeric column of the data set.
library(tidyverse)
df <- read_csv('adult_census.csv')

# Console output recorded for this call follows as `##` lines.
bar_plot(df)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Test bar_plot again (same data, same call as the run above).
library(tidyverse)
df <- read_csv('adult_census.csv')

bar_plot(df)


  4. Write the following function. Give examples to test your function.

Hint: Similar function

#' Draw dodged bar charts for every ordered pair of non-numeric columns
#'
#' For each pair of distinct non-numeric columns (`col1`, `col2`) the rows are
#' counted per combination of values and plotted with `col1` on the x-axis and
#' `col2` as the fill colour. Both orderings of a pair are plotted.
#'
#' Besides ggplot2 (attached here), the body uses `%>%`, `group_by()`,
#' `summarise()`, `n()` and `sym()`, which must already be attached by the
#' caller (e.g. via `library(tidyverse)`).
#'
#' @param d A data frame (or tibble).
#' @return `NULL`, invisibly. Called for its side effect of printing plots.
bar_plots_filled <- function(d) {
  library(ggplot2)
  # vapply() always returns a logical vector, so a zero-column input yields an
  # empty selection instead of the list() that sapply() would return.
  is_non_numeric <- vapply(d, function(x) !is.numeric(x), logical(1))
  non_numeric_cols <- names(d)[is_non_numeric]

  for (col1 in non_numeric_cols) {
    for (col2 in non_numeric_cols) {
      if (col1 != col2) {
        # .groups = "drop" returns an ungrouped table and silences the
        # "summarise() has grouped output by ..." message for every pair.
        plot_data <- d %>%
          group_by(!!sym(col1), !!sym(col2)) %>%
          summarise(count = n(), .groups = "drop")

        print(ggplot(plot_data, aes(x = !!sym(col1), y = count, fill = !!sym(col2))) +
                geom_bar(position = "dodge", stat = "identity") +
                labs(x = col1, y = "Count", fill = col2) +
                theme_minimal())
      }
    }
  }
  invisible(NULL)
}
# Test bar_plots_filled: prints one dodged bar chart for every ordered pair of
# distinct non-numeric columns. read_csv() is not loaded in this snippet; it
# relies on the library(tidyverse) call made earlier in the session.

df <- read_csv('adult_census.csv')

bar_plots_filled(df)


  5. Write the following function. Give examples to test your function.

Hint: Combine this function, this function, and the function in Question 4. One way to combine is creating a new function, quick_plot, and call these three functions within quick_plot.

library(ggplot2)
library(dplyr)

#' Draw dodged bar charts for every ordered pair of non-numeric columns
#'
#' For each pair of distinct non-numeric columns (`col1`, `col2`) the rows are
#' counted per combination of values and plotted with `col1` on the x-axis and
#' `col2` as the fill colour. Both orderings of a pair are plotted.
#'
#' Besides ggplot2 (attached here), the body uses `%>%`, `group_by()`,
#' `summarise()`, `n()` and `sym()`, which must already be attached by the
#' caller (e.g. via `library(dplyr)`).
#'
#' @param d A data frame (or tibble).
#' @return `NULL`, invisibly. Called for its side effect of printing plots.
bar_plots_filled <- function(d) {
  library(ggplot2)
  # vapply() always returns a logical vector, so a zero-column input yields an
  # empty selection instead of the list() that sapply() would return.
  is_non_numeric <- vapply(d, function(x) !is.numeric(x), logical(1))
  non_numeric_cols <- names(d)[is_non_numeric]

  for (col1 in non_numeric_cols) {
    for (col2 in non_numeric_cols) {
      if (col1 != col2) {
        # .groups = "drop" returns an ungrouped table and silences the
        # "summarise() has grouped output by ..." message for every pair.
        plot_data <- d %>%
          group_by(!!sym(col1), !!sym(col2)) %>%
          summarise(count = n(), .groups = "drop")

        print(ggplot(plot_data, aes(x = !!sym(col1), y = count, fill = !!sym(col2))) +
                geom_bar(position = "dodge", stat = "identity") +
                labs(x = col1, y = "Count", fill = col2) +
                theme_minimal())
      }
    }
  }
  invisible(NULL)
}


# Example usage:
# bar_plots_filled(your_data_frame)


#' Draw density plots of every numeric column, split by every non-numeric one
#'
#' For each (numeric, non-numeric) column pair, prints a density plot of the
#' numeric column with one semi-transparent fill per value of the non-numeric
#' column. Relies on ggplot2 (and `sym()`) already being attached.
#'
#' @param d A data frame (or tibble).
#' @return `NULL`, invisibly. Called for its side effect of printing plots.
density_plots <- function(d) {
  # One type check per column; vapply() keeps the result a logical vector even
  # for a zero-column input, where sapply() would return list().
  is_num <- vapply(d, is.numeric, logical(1))
  numeric_cols <- names(d)[is_num]
  non_numeric_cols <- names(d)[!is_num]

  for (num_col in numeric_cols) {
    for (non_num_col in non_numeric_cols) {
      print(ggplot(d, aes(x = !!sym(num_col), fill = !!sym(non_num_col))) +
              geom_density(alpha = 0.5) +
              labs(x = num_col, fill = non_num_col) +
              theme_minimal())
    }
  }
  invisible(NULL)
}

#' Draw a scatter plot for every ordered pair of numeric columns
#'
#' Both orderings of a pair are plotted (x/y swapped). Relies on ggplot2 (and
#' `sym()`) already being attached.
#'
#' @param d A data frame (or tibble).
#' @return `NULL`, invisibly. Called for its side effect of printing plots.
scatter_plots <- function(d) {
  # vapply() keeps the result a logical vector even for a zero-column input,
  # where sapply() would return list(). The non-numeric columns are not needed
  # here, so they are no longer computed.
  is_num <- vapply(d, is.numeric, logical(1))
  numeric_cols <- names(d)[is_num]

  for (num_col1 in numeric_cols) {
    for (num_col2 in numeric_cols) {
      if (num_col1 != num_col2) {
        print(ggplot(d, aes(x = !!sym(num_col1), y = !!sym(num_col2))) +
                geom_point() +
                labs(x = num_col1, y = num_col2) +
                theme_minimal())
      }
    }
  }
  invisible(NULL)
}

#' Produce the full set of exploratory plots for a data frame
#'
#' Runs, in this order, `bar_plots_filled()`, `density_plots()` and
#' `scatter_plots()` on the same input.
#'
#' @param d The data frame handed unchanged to each plotting function.
quick_plot <- function(d) {
  plotters <- list(bar_plots_filled, density_plots, scatter_plots)
  for (draw in plotters) {
    draw(d)
  }
}
# Test quick_plot: prints the filled bar charts, density plots and scatter
# plots for the data set in one call.

df <- read_csv('adult_census.csv')

# NOTE(review): suppressWarnings() hides every warning raised while plotting,
# not only the expected ones; confirm this is intended.
suppressWarnings(quick_plot(df))